// Source: www.gusucode.com > VC++ 编写的C语言的词法分析源码程序 > VC++ 编写的C语言的词法分析源码程序/code/lexer/lexer.cpp
/*
 * Lexical analyzer (tokenizer) for a subset of the C language.
 *
 * Usage: lexer <source-file>
 * The recognised tokens are written to "<source-file>.output.txt", one token
 * per line, each followed by the description of its category.
 */

/*
 Sample input program:

 #include <stdio.h>
 #include <stdlib.h>

 int main(int argc, char *argv[])
 {
     int a_value = 52357;
     int b_value = 0x1234abcd;
     long long c_value = 1234LL;
     unsigned int d_value = 1234U;
     long e_value = 5464L;
     unsigned long f_value = 6384762UL;
     double g_value = 0.1234;
     char j_value = 'a';
     char k_value = '\n';
     char *l_value = "a string \"";

     a_value += 1;
     b_value = 1 + 1 * 2 / 3 % 4;

     while (true) {
         if (1)
             break;
     }

     printf("%d", a_value);
     return 0;
 }
 */

#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

using namespace std;

/*
 * Token categories.
 * The enumerator values are used as indices into wordTypeList, so both must
 * list the categories in the same order.
 */
enum WordType {
    eKey,           // keyword
    eIdent,         // identifier
    eFloat,         // floating-point constant
    eFloatScie,     // floating-point constant in scientific notation
    eIntDec,        // decimal integer
    eIntUnsigned,   // unsigned integer
    eIntHex,        // hexadecimal integer
    eLong,          // long integer
    eLongUnsigned,  // unsigned long integer
    eLongLong,      // long long integer
    eChar,          // character literal
    eString,        // string literal
    eMacro,         // preprocessor marker
    eOperator,      // operator
    eDelimiter,     // delimiter
    eInclude,       // include directive
    eHeader,        // header file name
    eError          // malformed token
};

/*
 * Associates a token category with its human-readable description.
 */
struct WordTypeList {
    WordType type;
    string text;
};

/*
 * Descriptions of all token categories, indexed by WordType.
 * Used when writing the result file.
 */
struct WordTypeList wordTypeList[] = {
    {eKey, "关键字"},
    {eIdent, "标记符"},
    {eFloat, "浮点型"},
    {eFloatScie, "科学计数法的浮点数"},
    {eIntDec, "整数"},
    {eIntUnsigned, "无符号整数"},
    {eIntHex, "16进制表示的整数"},
    {eLong, "长整型"},
    {eLongUnsigned, "无符号长整型"},
    {eLongLong, "long long类型"},
    {eChar, "字符"},
    {eString, "字符串"},
    {eMacro, "宏指示符"},
    {eOperator, "操作符"},
    {eDelimiter, "界符"},
    {eInclude, "include语句"},
    {eHeader, "头文件"},
    {eError, "错误格式"}
};

/*
 * C keywords. The list is terminated by an empty string.
 */
string keyWordList[] = {
    "auto", "break", "case", "char", "const", "continue", "default", "do",
    "double", "else", "enum", "extern", "float", "for", "goto", "if",
    "int", "long", "register", "return", "short", "signed", "static", "sizeof",
    "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
    ""
};

/*
 * Character classification helpers.
 * The <cctype> functions have undefined behaviour for negative values, which
 * plain char produces for non-ASCII bytes, so every call goes through
 * unsigned char.
 */
static bool IsDigitChar(char ch)
{
    return isdigit(static_cast<unsigned char>(ch)) != 0;
}

static bool IsHexDigitChar(char ch)
{
    return isxdigit(static_cast<unsigned char>(ch)) != 0;
}

static bool IsIdentStart(char ch)
{
    return isalpha(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}

static bool IsIdentChar(char ch)
{
    return isalnum(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}

/*
 * Returns true when theWord is a C keyword, false otherwise.
 */
bool IsKeyWord(const string &theWord)
{
    // Linear scan up to the empty-string sentinel.
    for (int i = 0; !keyWordList[i].empty(); ++i) {
        if (theWord == keyWordList[i]) {
            return true;
        }
    }
    return false;
}

/*
 * Returns the index of the first non-whitespace character at or after pos
 * (spaces, tabs and line breaks are skipped), or -1 when there is none.
 */
int GetFirstNonNullChar(const string &str, int pos)
{
    const int end = static_cast<int>(str.length());
    if (pos < 0) {
        return -1;
    }
    for (; pos < end; ++pos) {
        if (!isspace(static_cast<unsigned char>(str[pos]))) {
            return pos;
        }
    }
    return -1;
}

/*
 * Returns the character that follows position pos as a one-character string.
 * Returns an empty string when pos is negative or the end of str is reached.
 */
string GetNextChar(const string &str, int pos)
{
    string theChar;
    const int end = static_cast<int>(str.length());

    if (pos < 0) {
        return theChar;
    }
    ++pos;
    if (pos < end) {
        theChar.push_back(str[pos]);
    }
    return theChar;
}

/*
 * Starting at pos, walks forward while the FOLLOWING character is a letter,
 * a digit or an underscore, and returns the index of the last character
 * before the boundary. Returns -1 when the string ends before a boundary
 * character is found.
 */
int GetBoundary(const string &str, int pos)
{
    while (true) {
        const string theChar = GetNextChar(str, pos);
        if (theChar.empty()) {
            return -1;
        }
        // A boundary is a character that is none of letter / digit / '_'.
        // (The original joined the three tests with ||, which is always true.)
        if (!IsIdentChar(theChar[0])) {
            return pos;
        }
        ++pos;
    }
}

/*
 * Lexes an identifier starting at wordBegin.
 *
 * On return wordEnd is the index of the last character of the identifier.
 * If the character at wordBegin cannot start an identifier, an empty string
 * is returned and wordEnd is wordBegin - 1.
 */
string LexIdentifier(const string &fileText, int wordBegin, int &wordEnd)
{
    const int end = static_cast<int>(fileText.length());
    int pos = wordBegin;

    // An identifier starts with a letter or an underscore ...
    if (pos >= 0 && pos < end && IsIdentStart(fileText[pos])) {
        ++pos;
        // ... and continues with letters, digits and underscores.
        while (pos < end && IsIdentChar(fileText[pos])) {
            ++pos;
        }
    }

    string theWord;
    if (pos > wordBegin) {
        theWord = fileText.substr(wordBegin, pos - wordBegin);
    }
    // pos is one past the identifier, also when the identifier ends the text.
    wordEnd = pos - 1;
    return theWord;
}

/*
 * Lexes the file name of an include directive.
 *
 * Both #include <stdio.h> and #include "stdio.h" are supported. wordBegin
 * must be the index of the character in front of the name, i.e. '<' or '"'.
 * Scanning stops at '>', '"', a line break or the end of the text.
 * On return wordEnd is the index of the last character of the name and the
 * returned string holds the name.
 */
string LexHeader(const string &fileText, int wordBegin, int &wordEnd)
{
    const int end = static_cast<int>(fileText.length());
    string theWord;
    int pos = (wordBegin < 0) ? 0 : wordBegin + 1;

    while (pos < end) {
        const char ch = fileText[pos];
        if (ch == '>' || ch == '"' || ch == '\n') {
            break;
        }
        theWord.push_back(ch);
        ++pos;
    }
    wordEnd = pos - 1;
    return theWord;
}

/*
 * Consumes an optional integer suffix (L, LL, U, UL; case-insensitive) at pos.
 * Updates type accordingly and returns the position after the suffix.
 */
static int LexIntSuffix(const string &fileText, int pos, WordType &type)
{
    const int end = static_cast<int>(fileText.length());
    if (pos >= end) {
        return pos;
    }
    const int first = toupper(static_cast<unsigned char>(fileText[pos]));
    const int second =
        (pos + 1 < end) ? toupper(static_cast<unsigned char>(fileText[pos + 1])) : 0;

    if (first == 'L') {
        if (second == 'L') {
            type = eLongLong;
            return pos + 2;
        }
        type = eLong;
        return pos + 1;
    }
    if (first == 'U') {
        if (second == 'L') {
            type = eLongUnsigned;
            return pos + 2;
        }
        type = eIntUnsigned;
        return pos + 1;
    }
    return pos;
}

/*
 * Lexes a numeric constant starting at wordBegin.
 *
 * Recognised forms: hexadecimal integers (0x...), decimal integers with an
 * optional L / LL / U / UL suffix, floating-point constants with at most one
 * '.', an optional exponent and an optional f / l suffix.
 * type receives the category; a constant that is directly followed by further
 * identifier characters (or, for decimal constants, another '.') is returned
 * as one eError token that includes the offending tail.
 * On return wordEnd is the index of the last character of the token. When the
 * character at wordBegin is not a digit, an empty string is returned, type is
 * eError and wordEnd equals wordBegin.
 */
string LexNumeric(const string &fileText, int wordBegin, int &wordEnd, WordType &type)
{
    const int end = static_cast<int>(fileText.length());

    type = eError;
    wordEnd = wordBegin;
    if (wordBegin < 0 || wordBegin >= end || !IsDigitChar(fileText[wordBegin])) {
        return string();
    }

    int pos = wordBegin;
    const bool isHex = fileText[pos] == '0' && pos + 1 < end &&
                       (fileText[pos + 1] == 'x' || fileText[pos + 1] == 'X');

    if (isHex) {
        pos += 2;
        const int digitsBegin = pos;
        while (pos < end && IsHexDigitChar(fileText[pos])) {
            ++pos;
        }
        // "0x" without any digit is malformed.
        type = (pos > digitsBegin) ? eIntHex : eError;

        if (pos < end && IsIdentChar(fileText[pos])) {
            // Garbage glued to the constant: take the whole run as an error.
            const int boundary = GetBoundary(fileText, pos);
            pos = (boundary < 0) ? end : boundary + 1;
            type = eError;
        }
    } else {
        // Integer part and optional fraction (at most one '.').
        bool seenDot = false;
        while (pos < end) {
            const char ch = fileText[pos];
            if (ch == '.' && !seenDot) {
                seenDot = true;
            } else if (!IsDigitChar(ch)) {
                break;
            }
            ++pos;
        }
        type = seenDot ? eFloat : eIntDec;

        // Optional exponent: e / E, optional sign, at least one digit.
        if (pos < end && (fileText[pos] == 'e' || fileText[pos] == 'E')) {
            int expPos = pos + 1;
            if (expPos < end && (fileText[expPos] == '+' || fileText[expPos] == '-')) {
                ++expPos;
            }
            if (expPos < end && IsDigitChar(fileText[expPos])) {
                while (expPos < end && IsDigitChar(fileText[expPos])) {
                    ++expPos;
                }
                pos = expPos;
                type = eFloatScie;
            }
        }

        if (type == eIntDec) {
            pos = LexIntSuffix(fileText, pos, type);
        } else if (pos < end) {
            const char ch = fileText[pos];
            if (ch == 'f' || ch == 'F' || ch == 'l' || ch == 'L') {
                ++pos;  // floating-point suffix
            }
        }

        if (pos < end && (IsIdentChar(fileText[pos]) || fileText[pos] == '.')) {
            // Garbage glued to the constant: take the whole run as an error.
            while (pos < end && (IsIdentChar(fileText[pos]) || fileText[pos] == '.')) {
                ++pos;
            }
            type = eError;
        }
    }

    wordEnd = pos - 1;
    return fileText.substr(wordBegin, pos - wordBegin);
}

/*
 * Shared implementation of LexString and LexChar.
 *
 * wordBegin is the index of the opening quote. The raw text between the
 * quotes is returned (escape sequences are kept verbatim, backslash
 * included). A backslash always escapes the character after it, so an
 * escaped quote does not end the literal while an escaped backslash in front
 * of the closing quote does not hide that quote.
 * On return wordEnd is the index of the last character in front of the
 * closing quote (wordBegin for an empty literal), or of the last character
 * of the text when the literal is not terminated.
 */
static string LexQuoted(const string &fileText, int wordBegin, int &wordEnd, char quote)
{
    const int end = static_cast<int>(fileText.length());
    string body;
    int pos = (wordBegin < 0) ? 0 : wordBegin + 1;

    while (pos < end && fileText[pos] != quote) {
        if (fileText[pos] == '\\' && pos + 1 < end) {
            body.push_back(fileText[pos]);
            ++pos;
        }
        body.push_back(fileText[pos]);
        ++pos;
    }
    wordEnd = pos - 1;
    return body;
}

/*
 * Lexes a string literal; wordBegin is the index of the opening '"'.
 * See LexQuoted for the meaning of the return value and wordEnd.
 */
string LexString(const string &fileText, int wordBegin, int &wordEnd)
{
    return LexQuoted(fileText, wordBegin, wordEnd, '"');
}

/*
 * Lexes a character literal; wordBegin is the index of the opening '\''.
 * See LexQuoted for the meaning of the return value and wordEnd.
 */
string LexChar(const string &fileText, int wordBegin, int &wordEnd)
{
    return LexQuoted(fileText, wordBegin, wordEnd, '\'');
}

/*
 * Reads the whole file filePath into fileTextBuf.
 * Returns 0 on success and -1 when the file cannot be opened (fileTextBuf is
 * left untouched in that case).
 */
int ReadSourceFile(const string &filePath, string &fileTextBuf)
{
    ifstream ifs(filePath.c_str());
    if (!ifs.good()) {
        return -1;
    }

    // Read up to end-of-file. The original getline(buf, length, EOF) call
    // stored at most length - 1 characters and so lost the last one.
    fileTextBuf.assign(istreambuf_iterator<char>(ifs), istreambuf_iterator<char>());
    return 0;
}

/*
 * Writes the token table to the file filePath: one line per token, the token
 * text followed by three tabs and the description of its category.
 */
void WriteParseResult(vector<pair<string, WordType>> &resultTable, const string &filePath)
{
    ofstream ofs(filePath.c_str());

    for (const pair<string, WordType> &entry : resultTable) {
        ofs << entry.first << "\t\t\t" << wordTypeList[entry.second].text << '\n';
    }
}

/*
 * Appends one token to the result table.
 */
static void AddToken(vector<pair<string, WordType>> &resultTable, const string &text,
                     WordType type)
{
    resultTable.push_back(pair<string, WordType>(text, type));
}

/*
 * Recognises an operator or delimiter that starts with ch (chNext is the
 * character after it, or '\0' at the end of the text).
 * On success stores the token in text / type and returns the number of
 * characters it occupies (1 or 2); returns 0 for unsupported characters.
 */
static int MatchPunctuator(char ch, char chNext, string &text, WordType &type)
{
    static const char *const twoCharOperators[] = {
        "+=", "++", "-=", "--", "*=", "/=", "==", "<=", ">="
    };

    for (const char *op : twoCharOperators) {
        if (ch == op[0] && chNext == op[1]) {
            text = op;
            type = eOperator;
            return 2;
        }
    }

    switch (ch) {
    case '+': case '-': case '*': case '/': case '=': case '<': case '>':
    case '%': case '(': case ')': case '[': case ']': case ',':
        type = eOperator;
        break;
    case '{': case '}': case ';': case '.':
        type = eDelimiter;
        break;
    default:
        return 0;
    }
    text.assign(1, ch);
    return 1;
}

/*
 * Core of the lexical analysis: splits fileText into tokens and appends them
 * to resultTable in source order.
 *
 * Line comments and block comments are skipped. String and character
 * literals yield three tokens (opening quote, content, closing quote); the
 * closing quote is only emitted when it is really present.
 * NOTE(review): as in the original, preprocessor directives other than
 * "include" only yield the "#" token (the directive name is dropped) and
 * characters that are not handled here (e.g. '&', '|', '!') are skipped
 * silently - confirm whether that is intended.
 */
void Parser(vector<pair<string, WordType>> &resultTable, const string &fileText)
{
    const int end = static_cast<int>(fileText.length());
    int currentPos = 0;

    while (true) {
        currentPos = GetFirstNonNullChar(fileText, currentPos);
        if (currentPos < 0 || currentPos >= end) {
            break;
        }

        const char ch = fileText[currentPos];
        const char chNext = (currentPos + 1 < end) ? fileText[currentPos + 1] : '\0';

        if (IsIdentStart(ch)) {
            const string word = LexIdentifier(fileText, currentPos, currentPos);
            AddToken(resultTable, word, IsKeyWord(word) ? eKey : eIdent);
        } else if (IsDigitChar(ch)) {
            WordType type = eError;
            const string number = LexNumeric(fileText, currentPos, currentPos, type);
            AddToken(resultTable, number, type);
        } else if (ch == '/' && chNext == '/') {
            // Line comment: skip to the line break (or the end of the text).
            while (currentPos < end && fileText[currentPos] != '\n') {
                ++currentPos;
            }
            continue;
        } else if (ch == '/' && chNext == '*') {
            // Block comment: continue behind its terminator so that the
            // closing '/' is not mistaken for a division operator.
            const string::size_type close = fileText.find("*/", currentPos + 2);
            if (close == string::npos) {
                break;  // unterminated comment swallows the rest of the text
            }
            currentPos = static_cast<int>(close) + 1;
        } else if (ch == '"' || ch == '\'') {
            const string quote(1, ch);
            const string body = (ch == '"')
                                    ? LexString(fileText, currentPos, currentPos)
                                    : LexChar(fileText, currentPos, currentPos);
            AddToken(resultTable, quote, eDelimiter);
            AddToken(resultTable, body, (ch == '"') ? eString : eChar);
            if (currentPos + 1 < end && fileText[currentPos + 1] == ch) {
                ++currentPos;
                AddToken(resultTable, quote, eDelimiter);
            }
        } else if (ch == '#') {
            AddToken(resultTable, "#", eMacro);
            currentPos = GetFirstNonNullChar(fileText, currentPos + 1);
            if (currentPos < 0 || currentPos >= end) {
                break;
            }
            const string directive = LexIdentifier(fileText, currentPos, currentPos);
            if (directive == "include") {
                AddToken(resultTable, directive, eInclude);
                currentPos = GetFirstNonNullChar(fileText, currentPos + 1);
                if (currentPos < 0 || currentPos >= end) {
                    break;
                }
                const char open = fileText[currentPos];
                if (open != '<' && open != '"') {
                    // Not a header name: lex this character normally instead
                    // of skipping it.
                    continue;
                }
                AddToken(resultTable, string(1, open), eDelimiter);
                const string header = LexHeader(fileText, currentPos, currentPos);
                AddToken(resultTable, header, eHeader);
                if (currentPos + 1 < end &&
                    (fileText[currentPos + 1] == '>' || fileText[currentPos + 1] == '"')) {
                    ++currentPos;
                    AddToken(resultTable, string(1, fileText[currentPos]), eDelimiter);
                }
            }
        } else {
            string text;
            WordType type = eError;
            const int consumed = MatchPunctuator(ch, chNext, text, type);
            if (consumed > 0) {
                AddToken(resultTable, text, type);
                currentPos += consumed - 1;
            }
        }

        // Step behind the last character of the token just handled.
        ++currentPos;
    }
}

// Define LEXER_NO_MAIN to link the lexer functions into another program
// (e.g. a test harness) that provides its own main().
#ifndef LEXER_NO_MAIN
int main(int argc, char *argv[])
{
    if (argc != 2) {
        cerr << "命令使用错误" << endl;
        cerr << "使用方法: lexer <源代码文件名>" << endl;
        return 1;
    }

    string fileTextBuf;
    string filePath(argv[1]);

    if (ReadSourceFile(filePath, fileTextBuf) == -1) {
        cerr << "文件打开错误,请确保文件路径正确" << endl;
        return 1;
    }

    vector<pair<string, WordType>> resultTable;
    Parser(resultTable, fileTextBuf);
    WriteParseResult(resultTable, filePath.append(".output.txt"));

    return 0;
}
#endif  // LEXER_NO_MAIN